In [22]:
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
In [5]:
# Location of the bank-marketing CSV, relative to the notebook's working directory.
bank_marketing_csv = "data/bank_marketing/bank-full.csv"
In [6]:
# The file uses ';' as its field separator, so the default ',' would not split the columns.
bank_data = pd.read_csv(bank_marketing_csv, sep=";")
In [7]:
# Preview the raw frame; pandas truncates the display to the first and last rows.
bank_data
Out[7]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no
1 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no
2 33 entrepreneur married secondary no 2 yes yes unknown 5 may 76 1 -1 0 unknown no
3 47 blue-collar married unknown no 1506 yes no unknown 5 may 92 1 -1 0 unknown no
4 33 unknown single unknown no 1 no no unknown 5 may 198 1 -1 0 unknown no
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45206 51 technician married tertiary no 825 no no cellular 17 nov 977 3 -1 0 unknown yes
45207 71 retired divorced primary no 1729 no no cellular 17 nov 456 2 -1 0 unknown yes
45208 72 retired married secondary no 5715 no no cellular 17 nov 1127 5 184 3 success yes
45209 57 blue-collar married secondary no 668 no no telephone 17 nov 508 4 -1 0 unknown no
45210 37 entrepreneur married secondary no 2971 no no cellular 17 nov 361 2 188 11 other no

45211 rows × 17 columns

In [8]:
# Build a profiling report over the full raw frame.
# NOTE(review): the progress bars show up under the to_file cell rather than here,
# which suggests the heavy computation is deferred until the report is rendered.
init_report = ProfileReport(bank_data)
In [9]:
# Export the profiling report as a standalone HTML file next to the notebook.
# (Plain string literal: the previous f-string had no placeholders.)
init_report.to_file("bank_marketing_EDA.html")
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|                                                                                           | 0/17 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 50.52it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]
In [10]:
# Show the report as this cell's output.
# NOTE(review): the same report is already exported to HTML above; embedding it
# here as well may noticeably inflate the saved notebook — confirm it is wanted.
init_report
Out[10]:

In [11]:
# Numeric features that are compared under the different scalers further down.
numerical_columns = [
    "balance",
    "duration",
    "campaign",
    "pdays",
    "previous",
]
numerical_bank_data = bank_data[numerical_columns]
In [12]:
# Preview the numeric subset to confirm that only the five selected columns remain.
numerical_bank_data
Out[12]:
balance duration campaign pdays previous
0 2143 261 1 -1 0
1 29 151 1 -1 0
2 2 76 1 -1 0
3 1506 92 1 -1 0
4 1 198 1 -1 0
... ... ... ... ... ...
45206 825 977 3 -1 0
45207 1729 456 2 -1 0
45208 5715 1127 5 184 3
45209 668 508 4 -1 0
45210 2971 361 2 188 11

45211 rows × 5 columns

In [13]:
# Summary statistics of the unscaled features; this is the baseline the scaled
# versions are compared against.
# NOTE(review): pdays is -1 from the minimum through the 75th percentile —
# presumably a sentinel rather than a real day count; confirm before scaling it
# as an ordinary numeric quantity.
numerical_bank_data.describe()
Out[13]:
balance duration campaign pdays previous
count 45211.000000 45211.000000 45211.000000 45211.000000 45211.000000
mean 1362.272058 258.163080 2.763841 40.197828 0.580323
std 3044.765829 257.527812 3.098021 100.128746 2.303441
min -8019.000000 0.000000 1.000000 -1.000000 0.000000
25% 72.000000 103.000000 1.000000 -1.000000 0.000000
50% 448.000000 180.000000 2.000000 -1.000000 0.000000
75% 1428.000000 319.000000 3.000000 -1.000000 0.000000
max 102127.000000 4918.000000 63.000000 871.000000 275.000000
In [ ]:
 
In [43]:
# Fit one scaler of each kind on the numeric features. The fitted objects are
# kept so the identical transformation can be re-applied to other data later.
#   StandardScaler: centre on the mean, divide by the standard deviation.
#   MinMaxScaler:   rescale each column to the [0, 1] range (library default).
#   RobustScaler:   centre on the median, divide by the inter-quartile range.
# NOTE(review): the scalers are fit on the complete dataset. If a train/test
# split is introduced later, fit on the training portion only to avoid leakage.
std_scaler = StandardScaler().fit(numerical_bank_data)
minmax_scaler = MinMaxScaler().fit(numerical_bank_data)
robust_scaler = RobustScaler().fit(numerical_bank_data)

# transform() returns plain NumPy arrays (column labels and index are lost).
std_data = std_scaler.transform(numerical_bank_data)
minmax_data = minmax_scaler.transform(numerical_bank_data)
robust_data = robust_scaler.transform(numerical_bank_data)

# Re-attach labels taken from the source frame itself instead of repeating a
# hard-coded column list, so the scaled frames cannot drift out of sync with
# the selection above and keep the same row index as the input.
std_df = pd.DataFrame(
    std_data,
    columns=numerical_bank_data.columns,
    index=numerical_bank_data.index,
)
minmax_df = pd.DataFrame(
    minmax_data,
    columns=numerical_bank_data.columns,
    index=numerical_bank_data.index,
)
robust_df = pd.DataFrame(
    robust_data,
    columns=numerical_bank_data.columns,
    index=numerical_bank_data.index,
)
In [47]:
# Sanity-check the standardised features: means should be ~0 and std ~1.
# The reported std is 1.000011 rather than exactly 1 because describe() uses the
# sample standard deviation (ddof=1) while StandardScaler divides by the
# population standard deviation (ddof=0); the ratio is sqrt(n / (n - 1)).
std_df.describe()
Out[47]:
balance duration campaign pdays previous
count 4.521100e+04 4.521100e+04 4.521100e+04 4.521100e+04 4.521100e+04
mean 1.760208e-17 6.035001e-17 3.017500e-17 2.011667e-17 4.023334e-17
std 1.000011e+00 1.000011e+00 1.000011e+00 1.000011e+00 1.000011e+00
min -3.081149e+00 -1.002478e+00 -5.693506e-01 -4.114531e-01 -2.519404e-01
25% -4.237719e-01 -6.025167e-01 -5.693506e-01 -4.114531e-01 -2.519404e-01
50% -3.002800e-01 -3.035165e-01 -2.465603e-01 -4.114531e-01 -2.519404e-01
75% 2.158743e-02 2.362370e-01 7.622994e-02 -4.114531e-01 -2.519404e-01
max 3.309478e+01 1.809470e+01 1.944365e+01 8.297431e+00 1.191360e+02
In [28]:
# Histograms of the raw (unscaled) numeric features, one panel per column.
hist_columns = ["balance", "duration", "campaign", "pdays", "previous"]

fig, axes = plt.subplots(3, 2, figsize=(10, 15))
fig.suptitle("Raw values")

# ravel() walks the grid row by row, matching the original panel placement.
panels = axes.ravel()
for ax, column in zip(panels, hist_columns):
    ax.hist(numerical_bank_data[column], bins=10)
    ax.set_title(column.capitalize())
    ax.set_ylabel("Count")

# The 3x2 grid has one more panel than there are features; hide the spare one
# instead of leaving an empty frame in the figure.
for ax in panels[len(hist_columns):]:
    ax.set_visible(False)

plt.show()
No description has been provided for this image
In [44]:
# Histograms of the StandardScaler output, one panel per column.
hist_columns = ["balance", "duration", "campaign", "pdays", "previous"]

fig, axes = plt.subplots(3, 2, figsize=(10, 15))
fig.suptitle("StandardScaler")

# ravel() walks the grid row by row, matching the original panel placement.
panels = axes.ravel()
for ax, column in zip(panels, hist_columns):
    ax.hist(std_df[column], bins=10)
    ax.set_title(column.capitalize())
    ax.set_ylabel("Count")

# The 3x2 grid has one more panel than there are features; hide the spare one
# instead of leaving an empty frame in the figure.
for ax in panels[len(hist_columns):]:
    ax.set_visible(False)

plt.show()
No description has been provided for this image
In [45]:
# Histograms of the MinMaxScaler output, one panel per column.
hist_columns = ["balance", "duration", "campaign", "pdays", "previous"]

fig, axes = plt.subplots(3, 2, figsize=(10, 15))
fig.suptitle("MinMaxScaler")

# ravel() walks the grid row by row, matching the original panel placement.
panels = axes.ravel()
for ax, column in zip(panels, hist_columns):
    ax.hist(minmax_df[column], bins=10)
    ax.set_title(column.capitalize())
    ax.set_ylabel("Count")

# The 3x2 grid has one more panel than there are features; hide the spare one
# instead of leaving an empty frame in the figure.
for ax in panels[len(hist_columns):]:
    ax.set_visible(False)

plt.show()
No description has been provided for this image
In [46]:
# Histograms of the RobustScaler output, one panel per column.
hist_columns = ["balance", "duration", "campaign", "pdays", "previous"]

fig, axes = plt.subplots(3, 2, figsize=(10, 15))
fig.suptitle("RobustScaler")

# ravel() walks the grid row by row, matching the original panel placement.
panels = axes.ravel()
for ax, column in zip(panels, hist_columns):
    ax.hist(robust_df[column], bins=10)
    ax.set_title(column.capitalize())
    ax.set_ylabel("Count")

# The 3x2 grid has one more panel than there are features; hide the spare one
# instead of leaving an empty frame in the figure.
for ax in panels[len(hist_columns):]:
    ax.set_visible(False)

plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: